# Data source: https://www.kaggle.com/harlfoxem/housesalesprediction
# ----------------------------------------------------------------------
# Notebook setup: core imports, plotting defaults, random seed, and
# pandas display options.  (Lines starting with % are IPython magics.)
# ----------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('ggplot') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# random state -- fixed seed so sampling/model fits below are reproducible
SEED=100
np.random.seed(SEED)
# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.options.display.float_format = '{:,}'.format # df.A.value_counts().astype(float)
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
# Print the library versions actually in use (helps reproduce results).
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
%%capture
# capture will not print in notebook
# ----------------------------------------------------------------------
# Google Colab setup cell: detect Colab via sys.modules and, when running
# there, pip-install the interpretability packages used later
# (eli5, lime, forestci, dtreeviz) and refresh pandas-profiling / tqdm.
# NOTE(review): the indentation of the `if ENV_COLAB:` body was lost in
# this flat export and has been restored here; the `import sklearn` /
# `import pandas_profiling` / `import forestci` lines may originally have
# been at top level (they are needed outside Colab too) -- verify against
# the original notebook.
# ----------------------------------------------------------------------
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
    ## mount google drive
    # from google.colab import drive
    # drive.mount('/content/drive')
    # dat_dir = 'drive/My Drive/Colab Notebooks/data/'
    # sys.path.append(dat_dir)
    ## Image dir
    # img_dir = 'drive/My Drive/Colab Notebooks/images/'
    # if not os.path.isdir(img_dir): os.makedirs(img_dir)
    # sys.path.append(img_dir)
    ## Output dir
    # out_dir = 'drive/My Drive/Colab Notebooks/outputs/'
    # if not os.path.isdir(out_dir): os.makedirs(out_dir)
    # sys.path.append(out_dir)
    ## Also install my custom module
    # module_dir = 'drive/My Drive/Colab Notebooks/Bhishan_Modules/'
    # sys.path.append(module_dir)
    # !cd drive/My Drive/Colab Notebooks/Bhishan_Modules/
    # !pip install -e bhishan
    # !cd -
    ## pip install
    #!pip install pyldavis
    # !pip install catboost
    #!pip install category_encoders # TargetEncoder
    # hyper parameter tuning
    # !pip install hyperopt
    #!pip install optuna # hyper param opt
    # model evaluation
    # !pip install shap
    !pip install eli5
    !pip install lime
    !pip install duecredit # forestci needs this
    !pip install forestci # fci.random_forest_error(model_rf, Xtrain,Xtest)
    !pip install dtreeviz # decision tree viz
    # faster pandas
    # df['x'].swifter.apply(myfunc)
    # df[['x','y']].swifter.apply(myfunc,pos_arg,keyword_arg=mykeyword_arg)
    # !pip install swifter
    # update modules
    # !pip install -U scikit-learn # we need restart
    import sklearn
    # update pandas profiling
    # profile = df.profile_report(style={'full_width':True})
    # profile.to_file(output_file="output.html")
    !pip install -U pandas-profiling # we need restart
    import pandas_profiling
    # Note: We need to restart kernel to use tqdm
    # from tqdm.notebook import trange, tqdm
    # tqdm.pandas()
    # out = df['A'].progress_apply(myfunc)
    !pip install -U tqdm
    # print
    print('Environment: Google Colaboratory.')
    # NOTE: If we update modules in gcolab, we need to restart runtime.
    import forestci as fci
    !pip freeze | grep prof
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# The javascript cell above stops Jupyter from collapsing long output
# into a scrollable box.
# ----------------------------------------------------------------------
# Interpretability / visualization imports used by the rest of the notebook.
import pydotplus
from IPython.display import Image
import tqdm
import eli5
import lime
import yellowbrick
from eli5 import show_weights
from eli5 import show_prediction
import lime.lime_tabular
# Load the pre-cleaned King County house-sales data straight from GitHub.
ifile = "https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned.csv?raw=true"
df = pd.read_csv(ifile)
print(df.shape)
# Rescale the target to millions of dollars for readability in plots.
df['price'] = df['price']/1e6 # millions
# Bare expressions below are Jupyter display outputs, not assignments.
df.head().T
df.columns.to_numpy()
df.dtypes
# Drop columns not used for modeling (raw date string and zipcode_top10).
df.drop(['date','zipcode_top10'],axis=1,inplace=True)
# Histogram of the target variable: sale price (in millions of dollars).
fig, ax = plt.subplots(figsize=(8, 6))
sns.distplot(df['price'], kde=False, ax=ax)
ax.set(xlabel="Price", ylabel="Counts", title="Home Price ")
# distribution is non-gaussian with right tail
# Hexbin joint plots of price against selected features.
# FIX: seaborn >= 0.11 deprecated (and later removed) passing the x/y
# Series positionally to jointplot; pass them as keyword arguments.
def _joint_price_plot(col, cmap=None):
    """Hexbin jointplot of df[col] vs price with titled margins.

    col  : column name of df to plot on the x axis.
    cmap : optional matplotlib colormap name for the hexbins.
    Returns the seaborn JointGrid.
    """
    extra = {'cmap': cmap} if cmap is not None else {}
    g = sns.jointplot(x=df[col], y=df['price'], kind='hex', **extra)
    g.ax_marg_x.set_title(f"House Price vs {col}")
    g.set_axis_labels(xlabel=f'{col}', ylabel='Price (Millions)')
    return g

g = _joint_price_plot('sqft_living', cmap='Greens')
g = _joint_price_plot('bedrooms')
# very few houses have more than 5 bedrooms
g = _joint_price_plot('age', cmap='Blues')
# when age increase, price decreases
# Pairwise scatter matrix for a handful of features (slow: n^2 panels),
# timed so we know how expensive it is to regenerate.
df.columns.to_numpy()
pair_cols = ['price','bedrooms','bathrooms','sqft_living','view','condition']
t0 = time.time()
sns.pairplot(df[pair_cols])
minutes, seconds = divmod(time.time() - t0, 60)
print('Time taken: {:.0f} min {:.0f} secs'.format(minutes, seconds))
plt.show()
# Spearman rank correlation among the key numeric columns.
corr = df[['price', 'bedrooms', 'bathrooms',
           'sqft_living','age','lat','long']].corr('spearman')

# Generate a mask for the upper triangle (the matrix is symmetric).
# FIX: the deprecated np.bool alias was removed in NumPy >= 1.24;
# use the builtin bool instead.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette (blue = positive, red = negative correlation).
# The earlier sns.diverging_palette(...) assignment was dead code --
# it was immediately overwritten -- so only the RdBu palette remains.
cmap = sns.color_palette("RdBu", 7)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, vmax=1.0, vmin=-1.0, center=0, cmap=cmap,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});
# Full pandas-profiling EDA report rendered inline in the notebook.
# NOTE(review): the `style` argument belongs to older pandas-profiling
# releases -- confirm against the installed version.
df.profile_report(style={'full_width':True})
df.head(2)
# 80/20 train/test split, reproducible via SEED.
# FIX: `import sklearn` alone does not guarantee the model_selection
# submodule is loaded as an attribute; import the function explicitly.
from sklearn.model_selection import train_test_split

target = 'price'
df_Xtrain, df_Xtest, ser_ytrain, ser_ytest = train_test_split(
    df.drop(target, axis=1),
    df[target],
    train_size=0.8,
    random_state=SEED)

# Plain numpy copies for estimators/tools that prefer raw arrays.
Xtrain = df_Xtrain.to_numpy()
ytrain = ser_ytrain.to_numpy().ravel()
Xtest = df_Xtest.to_numpy()
ytest = ser_ytest.to_numpy().ravel()
df_Xtrain.head()  # notebook display
# Overlay the train and test target distributions -- they should match.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(8, 6))
bins = 10
for arr, lbl in [(ytrain, "train"), (ytest, "test")]:
    sns.distplot(arr, kde=False, ax=ax, label=lbl, bins=bins)
ax.legend()
ax.set_ylabel('Count')
ax.set_xlabel('Price (Million)')
ax.set_title("Train mean {:0.2f},\
Test mean {:0.2f}".format(ytrain.mean(), ytest.mean()))
plt.show()
# Fit the baseline random-forest regressor.
# FIX: access RandomForestRegressor through an explicit import --
# `import sklearn` alone need not expose the `ensemble` submodule.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor(n_estimators=100, random_state=SEED)
print("Sizes for train {}, test {}".format(Xtrain.shape, Xtest.shape))
model.fit(Xtrain, ytrain)

# In-sample vs out-of-sample error (train MSE is optimistic by design).
mse_train = mean_squared_error(ytrain, model.predict(Xtrain))
mse_test = mean_squared_error(ytest, model.predict(Xtest))
print(f'Random Forest mean-squared error on train set: {mse_train:.5f}')
print(f'Random Forest mean-squared error on test set: {mse_test:.5f}')
from yellowbrick.regressor import PredictionError, ResidualsPlot
# residuals vs predicted values
fig, ax = plt.subplots(figsize=(8,6));
visualizer = ResidualsPlot(model, ax=ax)
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
# NOTE(review): poof() was renamed show() in yellowbrick >= 1.0 -- verify
# against the installed version.
g = visualizer.poof()
# Histogram of test-set residuals (prediction - truth).
ser_test_resids = pd.Series(model.predict(Xtest) - ytest)
ax = ser_test_resids.plot(kind="hist", bins=10,
title="Residuals on Predicted", color='g', alpha=1);
plt.show()
# residuals vs true values
dfx = pd.DataFrame({'Residual': ser_test_resids.to_numpy(), 'Truth': ytest})
fig, ax = plt.subplots(figsize=(8,6));
dfx.plot(kind="scatter", x='Truth', y='Residual', ax=ax,
title="Residual vs Truth");
# if predictions are close to truth, we get 45 deg line
fig, ax = plt.subplots(figsize=(8,6));
visualizer = PredictionError(model, ax=ax)
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
g = visualizer.poof()
# References:
import forestci as fci
# Per-prediction variance estimate for the fitted forest on the test set
# (infinitesimal-jackknife style estimate; see the forestci documentation).
V_IJ_unbiased = fci.random_forest_error(model, Xtrain, Xtest)
ypreds = model.predict(Xtest)
# Truth vs prediction, with error bars = sqrt(estimated variance).
fig, ax = plt.subplots(figsize=(8,6));
ax.errorbar(ytest, ypreds, yerr=np.sqrt(V_IJ_unbiased), fmt='o');
ax.set(title="Truth vs Predicted from test set with an estimate of variance on the error bars",
xlabel="ytest (truth)",
ylabel="ypreds (prediction)");
# Identity line: perfect predictions would fall on it.
plt.plot([0, 50], [0, 50], '--', label="Identity");
plt.legend();
# Does the variance estimate grow with the predicted value?
fig, ax = plt.subplots(figsize=(8,6));
ax.scatter(ypreds, V_IJ_unbiased);
ax.set(title="Regressed values vs unbiased variance estimate",
xlabel="ytest",
ylabel="Variance Infinite Jackknife Unbiased");
# If we build estimators on bootstrap samples of the original training set and
# test them all on the same test set, we can see
# if certain parts of the regression range are more variable.
# This might suggest that our original training data isn't very good
# at properly defining these parts of the regression range.
# fixed test set, subsampled training set
# need to identify where the most variance exists
import copy

n_items = Xtrain.shape[0]
N_BOOTSTRAP_MODELS = 20

# One column of truth ('price') plus one prediction column per model.
ser_ypreds_subsample = pd.DataFrame({'price': ytest})

# A shallow copy is sufficient: .fit() replaces the fitted state each time.
model_subsample = copy.copy(model)

# NOTE: the loop body's indentation was lost in the flat export and has
# been restored here.
for n in range(N_BOOTSTRAP_MODELS):
    # Draw 90% of the training rows, with replacement.
    # FIX: np.random.randint is the direct way to sample integer indices;
    # the original uniform(...).astype(np.int_) floor-cast was equivalent
    # but roundabout (and np.int_ spelled an implicit float->int cast).
    train_mask = np.random.randint(0, n_items, size=int(n_items * 0.9))
    df_Xtrain_masked = df_Xtrain.iloc[train_mask]
    ser_ytrain_masked = ser_ytrain.iloc[train_mask]

    model_subsample.fit(df_Xtrain_masked, ser_ytrain_masked)

    mse_train = mean_squared_error(ser_ytrain_masked,
                                   model_subsample.predict(df_Xtrain_masked))
    ypreds = model_subsample.predict(Xtest)
    ser_ypreds_subsample[f'prediction_{n}'] = ypreds
    mse_test = mean_squared_error(ytest, ypreds)
# Plot every bootstrap model's predictions over the truth, sorted by price.
fig, ax = plt.subplots()
ser_ypreds_subsample = ser_ypreds_subsample.sort_values(target).reset_index(drop=True)
predictions_only = ser_ypreds_subsample.drop([target], axis=1)
predictions_only.plot(ax=ax, legend=False)
ser_ypreds_subsample[target].plot(ax=ax, label=target, legend=True)
ax.set(title="{} models from subsampled training data,\
predicting on the same test data\nsorted by increasing price".format(N_BOOTSTRAP_MODELS))
plt.show()
ser_ypreds_subsample.head()  # notebook display
# we do not have much variance
# Truth (left axis) overlaid with the across-model prediction variance
# (right axis), both indexed by the price-sorted test rows.
fig, ax = plt.subplots()
ser_ypreds_subsample[target].plot(ax=ax, secondary_y=False,
label=target, legend=True)
ser_ypreds_subsample.drop([target], axis=1).var(axis=1).plot(
ax=ax, label="Variance", legend=True, secondary_y=True);
ax.set(title="Variance of subsampled models on the same test data, by Price");
# Pick one test-set row to explain with LIME / eli5 below.
example_to_explain_idx = 14
example_to_explain = df_Xtest.iloc[example_to_explain_idx]
example_to_explain_true_answer = ser_ytest.iloc[example_to_explain_idx]
feature_names = df.columns.drop(target)
print(f"Explaining the {example_to_explain_idx}th row from the testing set")
print("The answer we're looking for is: ", example_to_explain_true_answer)
# FIX: reshape(1, -1) makes the 1-D Series values a single-row 2-D array
# directly; the original reshape(-1, 1).T produced the same array less
# directly.
print("The predicted answer is:", float(model.predict(example_to_explain.values.reshape(1, -1))))
print("The input data X is: ")
pd.DataFrame(example_to_explain)
# Treat features with fewer than 10 distinct training values as categorical.
df_Xtrain.apply(pd.Series.nunique).loc[lambda x: x <10]
categorical_features = df_Xtrain.apply(pd.Series.nunique).loc[lambda x: x <10].index.to_list()
categorical_features
# Column positions -- LIME expects integer indices, not column names.
categorical_features_idx = [df_Xtrain.columns.get_loc(x) for x in categorical_features]
categorical_features_idx
feature_names = df_Xtrain.columns.to_list()
# NOTE(review): this rebinds `categorical_features` from names to indices.
categorical_features = categorical_features_idx
# LIME tabular explainer built over the raw training matrix.
explainer = lime.lime_tabular.LimeTabularExplainer(Xtrain,
feature_names=feature_names,
class_names=['price'],
categorical_features=categorical_features_idx,
verbose=True,
mode='regression')
# Lime Uses perturbed data neighborhood_data and neighborhood_labels
# Intercept is the generated linear model's intercept
# Prediction_local is the predicted output from the linear model
# Right is the predicted value from the explained regressor (not LIME's linear model)
exp = explainer.explain_instance(example_to_explain, model.predict, num_features=10)
exp.show_in_notebook(show_table=True)
exp.as_pyplot_figure()
# note that the double-plot is a bug: https://github.com/marcotcr/lime/issues/89
# (feature, weight) pairs of LIME's local linear surrogate model.
lst = exp.as_list()
lst
pd.DataFrame(lst)
import eli5
from eli5 import show_weights
from eli5 import show_prediction
# For a tree ensemble, eli5's "BIAS" term is the training-set mean target.
print("BIAS is the mean of the training data (i.e. a guess prior to using any features):", ytrain.mean())
model
feature_names = df_Xtrain.columns.to_list()
# Per-feature contribution breakdown for the single explained example.
show_prediction(model,
example_to_explain,
feature_names=feature_names,
show_feature_values=True)
# Impurity-based importances from the fitted forest, top 10.
df_imp = pd.DataFrame({'feature_importances_': model.feature_importances_}, index=feature_names)
df_imp.sort_values(by="feature_importances_", ascending=False).head(10).round(4)
# using eli5 importance visualizer
# Note that the +/- value assumes a Gaussian and the boxplot below shows that this isn't true
show_weights(model, feature_names=feature_names)
# Permutation importance: score drop when each test feature is shuffled.
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model).fit(Xtest, ytest)
eli5.show_weights(perm, feature_names=feature_names)
# Distribution of each feature's importance across the forest's trees.
# (Loop indentation restored; it was lost in the flat export.)
feature_names = df_Xtrain.columns.to_list()
df_imp = pd.DataFrame()
for est_idx, est_tree in enumerate(model.estimators_):
    df_imp["tree_{}".format(est_idx)] = est_tree.feature_importances_
df_imp.index = feature_names
df_imp = df_imp.T
sorted_index = df_imp.mean(axis=0).sort_values().index
fig, ax = plt.subplots(figsize=(8,6));
df_imp[sorted_index].plot(kind="box", vert=False, ax=ax, title="Feature importance distributions");
ax.set_xlabel("Importance")
# remove right/top border to make things slightly neater
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# visual tidy-up to make left axis small values slightly easier to read
# offset left and bottom axis
ax.spines['bottom'].set_position(('axes', -0.05))
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('axes', -0.05))
# Same data again as a strip plot overlaid on a box plot.
fig, ax = plt.subplots(figsize=(8,6));
sns.stripplot(data=df_imp[sorted_index[::-1]], jitter=0.05, orient="h", ax=ax, edgecolor="k", linewidth=1);
sns.boxplot(data=df_imp[sorted_index[::-1]], orient="h", ax=ax);
ax.set_title("Feature importance distributions");
ax.set_xlabel("Importance");
# Textual description of the ensemble via eli5.
show_weights(model, feature_names=feature_names, show="description")
# A small, shallow forest so individual member trees are readable.
est_depth_3 = sklearn.ensemble.RandomForestRegressor(n_estimators=10,
max_depth=3,
random_state=0)
print("Sizes for train {}, test {}".format(Xtrain.shape, Xtest.shape))
est_depth_3.fit(Xtrain, ytrain)
# Inspect the first two member trees individually.
est_tree0 = est_depth_3.estimators_[0]
show_weights(est_tree0, feature_names=feature_names)
est_tree1 = est_depth_3.estimators_[1]
show_weights(est_tree1, feature_names=feature_names)
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): this rebinds `model`, replacing the 100-tree forest fitted
# earlier with a fresh 10-tree one used only for visualization.
model = RandomForestRegressor(n_estimators=10)
model.fit(Xtrain,ytrain)
last_decision_tree = model.estimators_[-1]
last_decision_tree
# Export as dot file
# NOTE(review): `class_names` is only meaningful for classifiers; for this
# regressor it should be inert and could be dropped -- verify.
export_graphviz(last_decision_tree, out_file='tree.dot',
feature_names = feature_names,
class_names = target,
rounded = True,
proportion = False,
precision = 2,
filled = True)
# Render via the graphviz `dot` binary (must be installed on the system).
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
import pydotplus
# Create DOT data (in memory this time, no intermediate .dot file)
dot_data = sklearn.tree.export_graphviz(last_decision_tree, out_file=None,
feature_names=feature_names,
class_names=target)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph inline in the notebook
Image(graph.create_png())
# Create PDF
graph.write_pdf("forest.pdf")
# Create PNG
graph.write_png("forest.png")
import dtreeviz
# Sanity-check shapes/names before plotting (notebook display).
Xtrain.shape, len(feature_names), target
df_Xtrain.head(2)
# Univariate regression-tree view: price vs a single feature.
# NOTE(review): the rtreeviz_* functions belong to the dtreeviz 1.x API;
# the 2.x API replaced them with dtreeviz.model(...) -- confirm the
# installed version.
from dtreeviz.trees import rtreeviz_univar
fig = plt.figure()
ax = fig.gca()
t = rtreeviz_univar(ax,
df_Xtrain['sqft_living'], ytrain,
max_depth=2,
feature_name='Square foot Living',
target_name='price',
fontsize=14)
plt.show()
# Bivariate 3D view of the fitted regression surface.
from mpl_toolkits.mplot3d import Axes3D
from dtreeviz.trees import rtreeviz_bivar_3D
features = ['sqft_living', 'bedrooms']
figsize = (6,5)
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111, projection='3d')
t = rtreeviz_bivar_3D(ax,
df_Xtrain[features],ytrain,
max_depth=4,
feature_names=['Sqft Living', 'Bedrooms'],
target_name='price',
fontsize=14,
elev=20,
azim=25,
dist=8.2,
show={'splits','title'})
plt.show()
# Same bivariate tree rendered as a 2D heatmap of the splits.
from dtreeviz.trees import rtreeviz_bivar_heatmap
figsize = (6, 5)
fig, ax = plt.subplots(1, 1, figsize=figsize)
t = rtreeviz_bivar_heatmap(ax,
df_Xtrain[features],ytrain,
max_depth=4,
feature_names=['Sqft Living', 'Bedrooms'],
fontsize=14)
plt.show()
last_decision_tree
# feature_names
# Leaf-level views of a small decision tree via dtreeviz.
from dtreeviz.trees import viz_leaf_target
# FIX: DecisionTreeRegressor was used below but never imported anywhere
# in this notebook, so the fit line raised NameError.
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(max_depth=3, random_state=SEED)
dtr.fit(df_Xtrain, ytrain)

# Target-value distribution within each leaf.
viz_leaf_target(dtr,
                df_Xtrain, ytrain,
                feature_names,
                target,
                show_leaf_labels=True,
                grid=False,
                figsize=(4,12))

from dtreeviz.trees import viz_leaf_samples
viz_leaf_samples(dtr, figsize=(20,10))
viz_leaf_samples(dtr, display_type="text")
# Useful when you want to easily see the general distribution of leaf samples.
viz_leaf_samples(dtr, display_type="hist", bins=30, figsize=(20,7))